/*
  wiring_pulse.s - pulseInASM() function in different flavours
  Part of Arduino - http://www.arduino.cc/

  Copyright (c) 2014 Martino Facchin, 2020-2022 Spence Konde

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General
  Public License along with this library; if not, write to the
  Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  Boston, MA  02111-1307  USA
*/

/*
 * The following routine was generated by avr-gcc 4.8.3 with the following parameters
 * -gstabs -Wa,-ahlmsd=output.lst -dp -fverbose-asm -O2
 * on the original C function
 *
 * unsigned long pulseInSimpl(volatile uint8_t *port, uint8_t bit, uint8_t stateMask, unsigned long maxloops)
 * {
 *     unsigned long width = 0;
 *     // wait for any previous pulse to end
 *     while ((*port & bit) == stateMask)
 *         if (--maxloops == 0)
 *             return 0;
 *
 *     // wait for the pulse to start
 *     while ((*port & bit) != stateMask)
 *         if (--maxloops == 0)
 *             return 0;
 *
 *     // wait for the pulse to stop
 *     while ((*port & bit) == stateMask) {
 *         if (++width == maxloops)
 *             return 0;
 *     }
 *     return width;
 * }
 *
 * some compiler outputs were removed but the rest of the code is untouched
 *
 * Spence, Feb 2020: not untouched anymore! The first two loops ran in 11 cycles instead
 * of 16, so if no pulse was detected, the timeout would be reached when
 * 11/16ths of the requested timeout had elapsed. This was fixed by the addition
 * of 2 rjmps to the next line (for a 2 cycle nop that uses only 1 word) and 1 nop
 * to each of these loops before they decrement maxloops.
 * Additionally, removed duplication of return sequence to save 12b flash
 * which is conveniently exactly how much the other fix cost.
 * Feb 2021: Made it beautiful, and made previous added delays into
 * rjmp .+0 instead of jumping to label placed right ahead of them.
 * Jan 2022: Fix a typo added last feb. Saved 8 bytes of flash by using movw instead of
 * consecutive movs of adjacent registers and in one place in place of a pair of ldi 0
 * immediately following ldi of 0 to two other registers at the end.
 * Removed unused labels.
 * Made the comments useful. It looks like this was compiled for an absolutely ancient chip.
 * I think use of sbiw and movw could save another 4 words (8 bytes). Could also save 3 more
 * and lose a nop from each loop. Except then, it would be 15 clocks per, and the uglier math at
 * common clock speeds would cost more than that.
 * I still have no idea why this is a separate .s file instead of some plain old inline assembly.
 * Except maybe that nobody charged with dealing with this knew how to write assembly?
 */

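 /* Registers:
  * r12:r15 - temporary storage of result (the pulse width counter).
  * r16:r19 - stores remaining timeout (maxloops). In the first two phases we decrement this.
  *           In the final phase, we increment r12:r15 while comparing with this.
  * r24:r25 - port pointer address on entry; copied to Z (r30:r31). After the copy, r25 and
  *           r24 are reused as scratch for the port reads.
  * r20     - positive pulse ? bitmask : 0
  * r22     - bitmask
  * r22:r25 - on return, the result (copied from r12:r15), or zero on timeout.
  */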

#include <avr/io.h>

.section .text

.global countPulseASM

/*
 * countPulseASM - measure the length of a pulse on a pin, in loop iterations.
 *
 * Inputs (register use as found in this routine):
 *   r24:r25 - address of the port register to poll (copied to Z)
 *   r22     - bitmask of the pin within that port
 *   r20     - value (port & bitmask) has while the pulse is active
 *   r16:r19 - maxloops, the timeout expressed in loop iterations
 * Output:
 *   r22:r25 - number of iterations of the third loop for which the pulse
 *             was active, or 0 if any phase timed out.
 * Saved and restored: r12-r17.
 * Modified and not restored: r18, r19, r22-r25, r30, r31, SREG flags.
 *
 * TIMING: the instruction sequence inside each of the three loops is padded
 * so that all loops take the same number of cycles per iteration (16
 * according to the history notes at the top of this file). Do not add,
 * remove or reorder instructions inside the loops without recounting.
 */
countPulseASM:

.LM0:
.LFBB1:
/* prologue: function */
/* Save the registers this routine uses and must hand back unchanged.
 * Exactly six pushes, matched by the six pops in the epilogue. A second,
 * duplicated set of pushes here would leave six stray bytes on the stack
 * and make ret pick up a bogus return address. */
    push r12     ;
    push r13     ;
    push r14     ;
    push r15     ;
    push r16     ;
    push r17     ;
/* frame size = 0 */
/* stack size = 6 */
.L__stack_usage = 6
    movw r30,r24 ; Z = port pointer. r24 and r25 are free for scratch use from here on.

/*     unsigned long width = 0;
***     // wait for any previous pulse to end
***     while ((*port & bit) == stateMask)
***         if (--maxloops == 0)
***             return 0;
*/
    rjmp .L2     ; Start in middle of the loop - test the pin before the first decrement.
/* START LOOP ONE - WAIT FOR PREVIOUS PULSE TO END */
.L4:
    rjmp .+0     ; padding: 2+2+1 = 5 extra cycles so this loop matches loop three
    rjmp .+0     ;
    nop          ;
    subi r16, 1  ; 32-bit decrement of maxloops (r1 is used as the zero register)
    sbc r17, r1  ; sbiw is not an option here: it only accepts the r24, r26, r28 and r30 pairs
    sbc r18, r1  ;
    sbc r19, r1  ; sbc only leaves Z set if every byte so far was zero
    breq .L13    ; maxloops reached zero: timed out, return 0
.L2:
    ld r25,Z     ; Read the port
    and r25,r22  ; isolate the pin we are looking at
    cp r25,r20   ; compare with the active-pulse value
    breq .L4     ; still active: a pulse was already in progress when we got here, keep waiting
    rjmp .L6     ; otherwise enter the second loop at its test
/* END LOOP ONE - PREVIOUS PULSE ENDED - START LOOP TWO */
.L7:
/*     // wait for the pulse to start
***     while ((*port & bit) != stateMask)
***         if (--maxloops == 0)
***             return 0;
*/
    rjmp .+0     ; padding: 5 extra cycles, as in loop one
    rjmp .+0     ;
    nop          ;
    subi r16,1   ; 32-bit decrement of maxloops
    sbc r17, r1  ;
    sbc r18, r1  ;
    sbc r19, r1  ;
    breq .L13    ; maxloops reached zero: timed out, return 0
.L6:
    ld r25,Z     ; Read the port
    and r25,r22  ; isolate the pin we are looking at
    cpse r25,r20 ; compare with the active-pulse value, skip the rjmp if equal
    rjmp .L7     ; not equal: pulse has not started yet, keep waiting
/* END LOOP TWO - PULSE HAS STARTED */
    mov r12, r1  ; width = 0: clear the low word from the zero register...
    mov r13, r1  ;
    movw r14, r12 ; ...and copy it to the high word (one movw instead of two more movs)
    rjmp .L9     ; enter the third loop at its test
/* START LOOP THREE - TIMING THE PULSE */
.L10:
/*     // wait for the pulse to stop
***     while ((*port & bit) == stateMask) {
***         if (++width == maxloops)
***             return 0;
***     }
*/
    ldi r24,-1   ; ++width, done as a 32-bit subtract of -1 (0xFFFFFFFF)
    sub r12,r24  ;
    sbc r13,r24  ;
    sbc r14,r24  ;
    sbc r15,r24  ;
    cp r16,r12   ; compare width with what is left of maxloops
    cpc r17,r13  ;
    cpc r18,r14  ;
    cpc r19,r15  ;
    breq .L13    ; pulse has not ended before timeout, so measurement would be wrong. Return 0.
.L9:
    ld r24,Z     ; Read the port
    and r24,r22  ; isolate the pin we are looking at
    cp r24,r20   ; compare with the active-pulse value
    breq .L10    ; still active: count another iteration

/*     return width; */
    movw r22, r12 ; Pulse ended before timeout - result low word
    movw r24, r14 ; result high word
    rjmp .L11    ; skip the zeroing of the return registers
.L13:
    ldi r22, 0   ; Timed out in one of the three phases. Return zero.
    ldi r23, 0   ;
    movw r24, r22 ; saves a word vs 2 more ldi 0s
/* epilogue start */
.L11:
    pop r17      ; restore in reverse order of the prologue pushes
    pop r16      ;
    pop r15      ;
    pop r14      ;
    pop r13      ;
    pop r12      ;
    ret          ;
